setwd("/home/rusen/R/wvs")

library(lme4)
library(lmerTest)

load("wvs6.rdata")

#missing values (coded with negative integers in the dataset)
#transformed into NA values
WV6_Data_R$V11[WV6_Data_R$V11<0] <- NA
WV6_Data_R$V55[WV6_Data_R$V55<0] <- NA
WV6_Data_R$V56[WV6_Data_R$V56<0] <- NA
WV6_Data_R$V57[WV6_Data_R$V57<0] <- NA
WV6_Data_R$V96[WV6_Data_R$V96<0] <- NA
WV6_Data_R$V115[WV6_Data_R$V115<0] <- NA
WV6_Data_R$V229[WV6_Data_R$V229<0] <- NA
WV6_Data_R$V238[WV6_Data_R$V238<0] <- NA
WV6_Data_R$V239[WV6_Data_R$V239<0] <- NA
WV6_Data_R$V240[WV6_Data_R$V240<0] <- NA
WV6_Data_R$V242[WV6_Data_R$V242<0] <- NA
WV6_Data_R$V248[WV6_Data_R$V248<0] <- NA

#country-level data taken from a separate file
countryVars <- read.csv("GDP.csv")
countryVars$Country <- as.character(countryVars$Country)

#a new country identification variable and
#a new variable for GDP per capita created
WV6_Data_R$ctr <- WV6_Data_R$V2
WV6_Data_R$gdp <- WV6_Data_R$V2

#values retrieved by matching country ID no's
for (i in 1:60) {
  WV6_Data_R$ctr[WV6_Data_R$V2==countryVars$CountryIDno[i]] <- countryVars$Country[i]
  WV6_Data_R$gdp[WV6_Data_R$V2==countryVars$CountryIDno[i]] <- countryVars$GDPpcPPP[i]
}
rm(i)

#categorical variables defined as factor
#levels redefined where necessary
#reference levels changed where necessary

WV6_Data_R$ctrF <- factor(WV6_Data_R$ctr)

WV6_Data_R$V11F <- factor(WV6_Data_R$V11, labels=c("V. good", "Good",
                                                   "Fair", "Poor"))
WV6_Data_R$V11F <- relevel(WV6_Data_R$V11F, "Good")

#relationship status redefined as living with a partner
WV6_Data_R$V57.01 <- rep(NA, times=length(WV6_Data_R$V57))
WV6_Data_R$V57.01[WV6_Data_R$V57<3] <- 1
WV6_Data_R$V57.01[WV6_Data_R$V57>2] <- 0
WV6_Data_R$V57.01F <- factor(WV6_Data_R$V57.01, labels=c("No", "Yes"))
WV6_Data_R$V57.01F <- relevel(WV6_Data_R$V57.01F, "Yes")

#trust in political institutions
WV6_Data_R$V115F <- factor(WV6_Data_R$V115, labels=c("Great deal", "Quite",
                                                     "Not much", "None"))
WV6_Data_R$V115F <- relevel(WV6_Data_R$V115F, "Not much")

#employment status
WV6_Data_R$V229F <- factor(WV6_Data_R$V229, labels=c("Full time", "Part time",
                                                     "Self employed", "Retired", 
                                                     "Housewife", "Students", 
                                                     "Unemployed", "Other"))
WV6_Data_R$V229F <- relevel(WV6_Data_R$V229F, "Full time")

#social class
WV6_Data_R$V238F <- factor(WV6_Data_R$V238, labels=c("Upper", "Upper middle",
                                                     "Lower middle", "Working", 
                                                     "Lower"))
WV6_Data_R$V238F <- relevel(WV6_Data_R$V238F, "Lower middle")

#gender
WV6_Data_R$V240F <- factor(WV6_Data_R$V240, labels=c("Male", "Female"))
WV6_Data_R$V240F <- relevel(WV6_Data_R$V240F, "Female")

#education
WV6_Data_R$V248F <- factor(WV6_Data_R$V248, labels=c("No formal ed.", "Inc. primary",
                                                     "Comp. primary", "Inc. secondary1", 
                                                     "Comp. secondary1", "Inc. secondary2",
                                                     "Comp. secondary2", "Inc. uni", 
                                                     "Comp. uni"))
#education levels are reduced to four categories for simplicity
WV6_Data_R$V248.4L <- rep(NA, times=length(WV6_Data_R$V248))
WV6_Data_R$V248.4L[WV6_Data_R$V248<3] <- 1
WV6_Data_R$V248.4L[WV6_Data_R$V248==3|WV6_Data_R$V248==4|WV6_Data_R$V248==6] <- 2
WV6_Data_R$V248.4L[WV6_Data_R$V248==5|WV6_Data_R$V248==7|WV6_Data_R$V248==8] <- 3
WV6_Data_R$V248.4L[WV6_Data_R$V248==9] <- 4
WV6_Data_R$V248.4LF <- factor(WV6_Data_R$V248.4L, labels=c("Below primary", "Primary", 
                                                           "Secondary", "Higher"))
WV6_Data_R$V248.4LF <- relevel(WV6_Data_R$V248.4LF, "Secondary")

attach(WV6_Data_R)

#each variable is assigned to a new vector
#to create a smaller dataset for analysis
health <- V11F
WB <- V23
partner <- V57.01F
govconf <- V115F
employ <- V229F
class <- V238F
income <- V239
sex <- V240F
age <- V242
age.sq <- V242^2
educ <- V248.4LF
country <- ctrF
lgdp <- log(gdp)
trust <- V56
auton <- V55

#vectors are combined into a dataset with standardized numeric variables
data.raw.cont <- cbind.data.frame(WB, income, age, age.sq, gdp, lgdp, trust, auton)
data.raw.factor <- cbind.data.frame(health, class, employ, educ, sex, partner, country, govconf)
data.scaled.cont <- scale(data.raw.cont)
data.scaled <- cbind.data.frame(data.scaled.cont, data.raw.factor)
data.final <- data.scaled[complete.cases(data.scaled), ]

detach(WV6_Data_R)
rm(data.raw.cont, data.raw.factor, data.scaled, data.scaled.cont, 
   age, age.sq, auton, class, country, educ, employ, govconf, 
   health, income, lgdp, partner, sex, trust, WB)

### models ###

#empty variance component model
VCmodel <- lmer(WB ~ (1 | country), data=data.final, REML=F)
summary(VCmodel)

#random intercept model with individual level variables
RImodel1 <- lmer(WB ~ income + 
                   health + employ + educ + class + auton + trust + govconf +
                   sex + partner + age + age.sq + 
                   (1 | country), data=data.final, REML=F)
summary(RImodel1)

anova(VCmodel, RImodel1)

#random intercept model with the addition of country level variables
RImodel2 <- lmer(WB ~ income + lgdp + 
                   health + employ + educ + class + auton + trust + govconf +
                   sex + partner + age + age.sq + 
                   (1 | country), data=data.final, REML=F)
summary(RImodel2)

anova(RImodel1, RImodel2)

#random intercept and slope (income) model
RSImodel <- lmer(WB ~ income + lgdp + 
                   health + employ + educ + class + auton + trust + govconf +
                   sex + partner + age + age.sq + 
                   (1 + income | country), data=data.final, REML=F)
summary(RSImodel)

anova(RImodel2, RSImodel)

#cross-level interaction model
CLImodel <- lmer(WB ~ income * lgdp + 
                   health + employ + educ + class + auton + trust + govconf +
                   sex + partner + age + age.sq + 
                   (1 + income | country), data=data.final, REML=F)
summary(CLImodel)

anova(RSImodel, CLImodel)

#coefficients from the second and fourth models is retrieved
RI.coefs <- coef(RImodel1)$country
RSI.coefs <- coef(RSImodel)$country
CLI.coefs <- coef(CLImodel)$country

#new variable for GDP per capita raw values
RI.coefs$GDPraw <- RI.coefs$lgdp
RSI.coefs$GDPraw <- RSI.coefs$lgdp
CLI.coefs$GDPraw <- CLI.coefs$lgdp

#real GDP per capita values are retrieved
for (i in 1:60) {
  RI.coefs$GDPraw[row.names(RI.coefs)==countryVars$Country[i]] <- countryVars$GDPpcPPP[i]
  RSI.coefs$GDPraw[row.names(RSI.coefs)==countryVars$Country[i]] <- countryVars$GDPpcPPP[i]
  CLI.coefs$GDPraw[row.names(CLI.coefs)==countryVars$Country[i]] <- countryVars$GDPpcPPP[i]
  }
rm(i)

#GDP per capita on log and standardized scale
RI.coefs$logGDP <- log(RI.coefs$GDPraw)
RI.coefs$logGDP.sc <- scale(RI.coefs$logGDP)
RSI.coefs$logGDP <- log(RSI.coefs$GDPraw)
RSI.coefs$logGDP.sc <- scale(RSI.coefs$logGDP)
CLI.coefs$logGDP <- log(CLI.coefs$GDPraw)
CLI.coefs$logGDP.sc <- scale(CLI.coefs$logGDP)

#turn matrices into data frame
RI.coef.df <- data.frame(RI.coefs)
RSI.coef.df <- data.frame(RSI.coefs)
CLI.coef.df <- data.frame(CLI.coefs)
rm(RI.coefs,RSI.coefs,CLI.coefs)

#predicting random intercepts by GDP
intVgdp <- lm(scale(X.Intercept.) ~ logGDP.sc, data=RI.coef.df)
summary(intVgdp)

#predicting random slopes by GDP
coefVgdp <- lm(scale(income) ~ logGDP.sc, data=RSI.coef.df)
summary(coefVgdp)


## plots ## 

require(ggplot2)

#random intercepts
RI.plot <- 
  ggplot(RI.coef.df, aes(logGDP.sc, X.Intercept., label= rownames(RI.coef.df))) + 
    geom_point() + geom_text(angle=45, hjust = 0, nudge_x = 0.02) + 
    geom_smooth(method="lm", se=F) + 
    xlab("GDP per capita, PPP, log scale, standardized") + ylab("Random intercepts") +
    annotate("text", x=-2, y=0.7, label="y=0.10+0.08x") + 
    annotate("text", x=-2, y=0.6, label="R-sq=0.11") + 
    theme(axis.title=element_text(size=20))
RI.plot

#random slopes
RSI.plot <- 
  ggplot(RSI.coef.df, aes(logGDP.sc, income, label=rownames(RSI.coef.df))) + 
    geom_point() + geom_text(angle=45, hjust = 0, nudge_x = 0.02) + 
    geom_smooth(method="lm", se=F) + 
    xlab("GDP per capita, PPP, log scale, standardized") + 
    ylab("Random slopes of relative income") + 
    annotate("text", x=-2, y=0.065, label="y=0.15-0.03x") + 
    annotate("text", x=-2, y=0.035, label="R-sq=0.12") + 
    theme(axis.title=element_text(size=20))
RSI.plot
  
#relative income curves positioned acc. to GDP, r-intercept and r-slope

CLI.coef.df$xmid <- scale(CLI.coef.df$GDPraw)
CLI.coef.df$ymid <- CLI.coef.df$X.Intercept.+CLI.coef.df$logGDP*CLI.coef.df$lgdp
CLI.coef.df$xbeg <- CLI.coef.df$xmid-0.25
CLI.coef.df$xend <- CLI.coef.df$xmid+0.25                      
CLI.coef.df$ybeg <- CLI.coef.df$ymid-0.25*CLI.coef.df$income
CLI.coef.df$yend <- CLI.coef.df$ymid+0.25*CLI.coef.df$income

scattered.curves <- 
  ggplot(CLI.coef.df, aes(xmid, ymid, label=rownames(RSI.coef.df))) + 
    geom_text(vjust=1) + 
    stat_smooth(method="lm", formula=y~log(x+1), se=F) + 
    geom_segment(aes(x=xbeg, xend=xend, y=ybeg, yend=yend)) + 
    xlab("GDP per capita, PPP, standardized") + 
    ylab("SWB as function of random intercepts and GDP p.c.") + 
    annotate("text", x=4, y=1.28, label="y=0.92+0.08ln(x+1)") + 
    annotate("text", x=4, y=1.22, label="R-sq=0.11") + 
    theme(axis.title=element_text(size=20))
scattered.curves

summary(lm(ymid~log(xmid+1), data=CLI.coef.df))
